*Kaggle Expedia data
*******************************************************************************************************************************
*************************************Table 3, Table S.6, Figure S.5************************************************************
*******************************************************************************************************************************
clear
import delimited train.csv
*9917530 obs at first

drop comp* prop_location_score2 srch_query_affinity_score site_id orig_destination_distance

rename visitor_location_country_id visCountryId
rename visitor_hist_starrating visHistStars
rename visitor_hist_adr_usd visHistPrice
rename prop_country_id hotelCountryId
rename prop_id hotelId
rename prop_starrating hotelStars
rename prop_review_score hotelReview
rename prop_brand_bool brand
rename prop_location_score1 hotelLocationScore1
rename prop_log_historical_price hotelHistPrice
rename price_usd price
rename promotion_flag promo
rename srch_destination_id destinationId
rename srch_length_of_stay tripLength
rename srch_booking_window searchWindow
rename srch_adults_count reqAdult
rename srch_children_count reqChild
rename srch_room_count reqRoomNum
rename srch_saturday_night_bool satNight
rename random_bool rand
rename click_bool click
rename gross_bookings_usd grossSpent
rename booking_bool tran

*replace nulls
replace hotelReview="" if hotelReview=="NULL"
replace visHistStars="" if visHistStars=="NULL"
replace visHistPrice="" if visHistPrice=="NULL" 
replace grossSpent="" if grossSpent=="NULL"  
replace hotelHistPrice=. if hotelHistPrice==1
replace hotelStars=. if hotelStars==0

*destring
destring hotelReview, replace 
destring grossSpent, replace 
destring visHistStars, replace
destring visHistPrice, replace
replace visHistPrice=. if visHistPrice==0

*de-log historical price
replace hotelHistPrice=exp(hotelHistPrice)

*CLEANING
*delete searches with prices <10, >1000
sort srch_id price
by srch_id, sort: gen g=1 if (price<10|price>1000)
replace g=0 if g==.
by srch_id, sort: egen h=total(g)
drop if h!=0
drop g h
unique srch_id 
*385822 search id left

*gen implied tax from price, num of rooms, number of nights and total spent
gen tax=(grossSpent/(tripLength*reqRoomNum))-price
*drop if implied tax is greater than 30% or too small
by srch_id, sort: gen gap=1 if ((tax!=.&tax<1)|(tax!=.&tax>0.3*price))
replace gap=0 if gap==.
by srch_id, sort: egen h=total(gap)
drop if h!=0
drop h gap
unique srch_id
*308034 search id left 
drop tax grossSpent

*labeling
label var price "Price"
label var promo "Promotion"
label var destinationId "Destination ID"
label var tripLength "Trip Length (days)"
label var searchWindow "Booking Window (days)"
label var reqAdult "Adults"
label var reqChild "Children"
label var reqRoomNum "Rooms"
label var satNight "Saturday Night"
label var rand "Random Ranking"
label var click "Click"
label var tran "Transaction"
label var visCountryId "Consumer Country ID"
label var visHistStars "Consumer Hist. Stars"
label var visHistPrice "Consumer Hist. Price"
label var hotelCountryId "Hotel Country ID"
label var hotelId "Hotel ID"
label var hotelStars "Stars"
label var hotelReview "Review Score"
label var brand "Chain"
label var hotelLocationScore1 "Location Score"
label var hotelHistPrice "Hotel Hist. Price"
label var position "Position"


*drop opaque offers
gen n=1 if (position==5|position==11|position==17|position==23|position>37)
by srch_id, sort: egen nn=total(n)
drop if nn>0
unique srch_id 
*268997 choice sets

*drop infrequent destinations/unpopulated impressions
*number of searches by destination
by srch_id, sort: gen h=1 if _n==1
tostring destinationId, gen(dest)
tostring rand, gen(sth)
gen var=dest+sth
by var, sort: egen gap=sum(h)
*need enough instantiations of the same ranking (>=50 searches)
keep if gap>=50
drop h gap var n nn
*136266 choice sets

*keep observations where transactions happen in top10 positions 
gen tranposit=position if tran==1
gen clickposit=position if click==1
bysort srch_id:egen maxtran=max(tranposit)
replace maxtran=0 if maxtran==.
bysort srch_id:egen maxclick=max(clickposit)
replace maxclick=0 if maxclick==.
drop if maxtran==0| maxtran>12

*keep choice set with 10 options
keep if position<13
sort srch_id position
bysort srch_id: gen AltID=_n 
bysort srch_id: egen maxalt=max(AltID)
keep if maxalt==10 
*54648 choice sets

*********************heter 219 vs others 
gen country=(visCountryId==219)

drop clickposit tranposit maxclick maxtran rand sth satNight reqRoomNum reqChild reqAdult searchWindow tripLength dest date_time visCountryId visHistStars visHistPrice hotelCountryId hotelId hotelHistPrice destinationId maxalt

*replace srch_id with choice set id
preserve 
keep srch_id 
duplicates drop 
sort srch_id 
gen ID=_n
tempfile ID 
save `ID',replace
restore 

merge m:1 srch_id using `ID'
drop _merge 
rename AltID altID 
save flexlogitfull,replace 



********************************************************************************************************
*************************************Table 3************************************************************
********************************************************************************************************
*summary stats Table 3
use flexlogitfull,clear 
replace hotelStar=0 if hotelStar==. 
replace hotelReview=0 if hotelReview==.
eststo: quietly estpost sum price hotelStars hotelReview brand hotelLocationScore1 promo , detail  
esttab using "sumstat.tex", cells("count(label(Observations) fmt(0)) mean(label(Mean) fmt(2)) p50(label(Median) fmt(2)) sd(label(SD) fmt(2)) min(label(Min) fmt(0)) max(label(Max) fmt(0))") label replace noobs  nomtitles booktabs nonumbers title(Summary Statistics \label{sumstat})
eststo clear




*********************************************************************************************************
*************************************Table S.6, Figure S.5***********************************************
*********************************************************************************************************
 
 *histogram of clicks conditional on search good 1-10 
 use flexlogitfull,clear
 set seed 600
 gen random=runiform()
 sort ID hotelLocationScore1 random
 bys ID:gen i=_n
 replace i=10-i+1 
 forvalues j=1/10{
 gen click`j'=(click==1&i==`j')
 }
 bys ID:egen sumclick=sum(click)
  forvalues j=1/10{
  bys ID:egen maxclick`j'=max(click`j')
   }
 keep ID sumclick maxclick* 
 duplicates drop

 *********************************************************************************************************
*************************************Figure S.5***********************************************
*********************************************************************************************************
  forvalues j=1/10{
 histogram sumclick if maxclick`j'>0, xtitle("Number of Clicks")
 graph export "click`j'.png",replace
 }
*********************************************************************************************************
*************************************Table S.6***********************************************
********************************************************************************************************* 
 * Initialize a matrix to store the summary stats
matrix results = J(10, 3, .) // 10 rows, 3 columns: mean, sd, N

* Loop through j = 1 to 10
forvalues j = 1/10 {
    quietly summarize sumclick if maxclick`j' > 0
    matrix results[`j', 1] = r(mean)
    matrix results[`j', 2] = r(sd)
    matrix results[`j', 3] = r(N)
}

* Create a dataset from the matrix
clear
set obs 10
gen j = _n
gen mean = .
gen sd = .
gen N = .

forvalues i = 1/10 {
    replace mean = results[`i', 1] in `i'
    replace sd   = results[`i', 2] in `i'
    replace N    = results[`i', 3] in `i'
}

rename (j mean sd N)(Searched_Good Mean_Number_of_Clicks Standard_Deviation Number_of_Consumers)
 


********************************************prepare samples for flexible logit estimations**********

use flexlogitfull,clear

rename price x1 
rename hotelStars x2 
rename brand x3 
rename position x4
rename promo x5 
rename hotelReview x6 
rename hotelLocationScore z
drop click srch_id 
rename tran chosen
forvalues i=1/6{
replace x`i'=0 if x`i'==.
}

replace z=0 if z==.

  rename x4 position
  tab position,gen(x6)
  drop x610
  rename x6 x4 
  rename x3 brand 
  rename x4 x3
  rename brand x4 
 rename (x61 x62 x63 x64 x65 x66 x67 x68 x69) (x6 x7 x8 x9 x10 x11 x12 x13 x14)
 drop position  
 tempfile flexlogit_posdummy
save `flexlogit_posdummy',replace
  
  drop country 
  save flexlogit,replace
 
 ***********************create subsample for heterogeneity analysis
use `flexlogit_posdummy',clear
 keep if country==1
 drop country
 save flexlogit219,replace 
 
 use `flexlogit_posdummy',clear
 keep if country==0
 drop country
 save flexlogitother,replace
 
 
 
********************************************************************************************************** 
*************************************Standard logit results in Table S.4 (before normalization)**********
*********************************************************************************************************
 use flexlogit,clear
 clogit chosen x* z, group(ID)
*price coefficient from standard logit 
*star coefficient from standard logit 
*review coefficient from standard logit 
*location coefficient from standard logit 
  
  

 



 
 
 
 
 
 
 
 
 
 